{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# 数据的导入\n",
    "import pandas as pd\n",
    "df = pd.read_csv('data/employees.csv')\n",
    "print(type(df))\n",
    "print(df.tail())\n",
    "print(df.salary.mean())\n",
    "# 数据的导出\n",
    "df = df.tail()\n",
    "df.to_csv('data/new.csv')"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# json\n",
    "df = pd.read_json('data/data1.json')\n",
    "print(type(df))\n",
    "\n",
    "import json\n",
    "with open('data/test.json') as f:\n",
    "    data = json.load(f)\n",
    "# print(data['users'])\n",
    "print(type(data))\n",
    "df = pd.DataFrame(data['users'])\n",
    "print(type(df))\n",
    "df"
   ],
   "id": "1468fc27208a3719",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 缺失值的处理\n",
    "# nan:not a number\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "s = pd.Series([12,25,np.nan, None, pd.NA])\n",
    "df = pd.DataFrame([[1,pd.NA,2],[2,3,5],[None,4,6]],columns=['第1列','第2列','第3列'])\n",
    "print(s)\n",
    "# 查看是否是缺失值\n",
    "print(s.isna())\n",
    "print(s.isnull())\n",
    "print(df.isna())\n",
    "print(df.isnull())\n",
    "print(df.isna().sum(axis=1))\n",
    "print(s.isna().sum()) #查看缺失值的个数\n",
    "\n",
    "# 剔除缺失值\n",
    "print(s.dropna())\n",
    "print('-'*30)\n",
    "print(df)\n",
    "print(df.dropna()) #剔除一整条的记录\n",
    "print(df.dropna(how='all')) #如果所有的值都是缺失值，删除这一行\n",
    "print(df.dropna(thresh=1)) #如果至少有n个值不是缺失值，就保留\n",
    "print(df.dropna(axis=1)) #剔除一整列的记录\n",
    "print(df.dropna(subset=['第1列'])) #如果某列有缺失值，则删除这一行\n",
    "\n",
    "# 填充缺失值\n",
    "df = pd.read_csv('data/weather_withna.csv')\n",
    "df.tail()\n",
    "df.isna().sum(axis=0)\n",
    "df.head()\n",
    "print(df.fillna({'temp_max':20,'wind':2.5}).tail()) #使用字典来填充\n",
    "print(df.fillna(df[['temp_max','wind']].mean()).tail()) #使用统计值来填充\n",
    "print(df.ffill().tail())#用前面的相邻值填充\n",
    "print(df.bfill().tail())#用后面的相邻值填充"
   ],
   "id": "a158d5940347c02d",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 时间数据的处理\n",
    "import pandas as pd\n",
    "d = pd.Timestamp('2015-02-28 10:22')\n",
    "d1 = pd.Timestamp('2015-02-28 13:22')\n",
    "print(d)\n",
    "print(type(d))\n",
    "print(\"年：\",d.year)\n",
    "print(\"月：\",d.month)\n",
    "print(\"日：\",d.day)\n",
    "print(d.hour, d.minute, d.second)\n",
    "print(\"季度：\",d.quarter)\n",
    "print(\"是否是月底：\",d.is_month_end)\n",
    "# 方法\n",
    "print(\"星期几：\",d.day_name())\n",
    "print(\"转换为天：\",d.to_period(\"D\"))\n",
    "print(\"转换为季度：\",d1.to_period(\"Q\"))\n",
    "print(\"转换为年度：\",d1.to_period(\"Y\"))\n",
    "print(\"转换为月度：\",d1.to_period(\"M\"))\n",
    "print(\"转换为周维度：\",d1.to_period(\"W\"))"
   ],
   "id": "97eff609851d56ad",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 字符串转换为日期类型\n",
    "a = pd.to_datetime('20150228')\n",
    "print(a)\n",
    "print(type(a))\n",
    "print(a.day_name())\n",
    "\n",
    "# dataFrame 日期转换\n",
    "df = pd.DataFrame({\n",
    "    'sales':[100,200,300],\n",
    "    'date':['20250601','20250602','20250603']\n",
    "})\n",
    "df['datetime'] = pd.to_datetime(df['date'])\n",
    "df\n",
    "print(df.info())\n",
    "print(type(df['datetime']))\n",
    "df['week']=df['datetime'].dt.day_name()\n",
    "df['datetime'].dt.year\n",
    "\n"
   ],
   "id": "706ad9c1fa725a06",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# csv 日期转换\n",
    "df = pd.read_csv('data/weather.csv',parse_dates=['date'])\n",
    "df.info()\n",
    "df['date'].dt.day_name()"
   ],
   "id": "fa00ff7c4f05009a",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 日期数据作为索引\n",
    "# df.set_index('date' , inplace=True)#设置原来的df的索引\n",
    "print(df.loc[\"2013-01\":\"2013-02\"])"
   ],
   "id": "6ea95365b288433c",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 时间间隔\n",
    "d1 = pd.Timestamp('2013-01-15')\n",
    "d2 = pd.Timestamp('2023-02-23')\n",
    "d3 = d2-d1\n",
    "print(type(d3))\n",
    "print(d3)"
   ],
   "id": "8073358ce0bff679",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "df = pd.read_csv('data/weather.csv',parse_dates=['date'])\n",
    "df.info()\n",
    "df['delta'] = df['date'] - df['date'][0]\n",
    "df.set_index('delta',inplace=True)"
   ],
   "id": "b41301a845c765d8",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "df\n",
    "print(df.loc['10 days':'20 days'])"
   ],
   "id": "11e8efb045dcdc20",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "days = pd.date_range(\"2025-07-03\",\"2026-02-09\",freq=\"W\")\n",
    "days = pd.date_range(\"2025-07-03\",periods=10,freq=\"QE\")\n",
    "print(days)"
   ],
   "id": "ca49f6bfb327544e",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "df = pd.read_csv('data/weather.csv',parse_dates=['date'])\n",
    "# 重新采样\n",
    "df.set_index('date',inplace=True)"
   ],
   "id": "e0d18fd948d29dac",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "df[ [\"temp_max\",\"temp_min\"]].resample(\"MS\").mean()",
   "id": "6eb7c287f41771d5",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "df[ [\"temp_max\",\"temp_min\"]].resample(\"YE\").mean()",
   "id": "2904c28ec328f65e",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "import pandas as pd\n",
    "data = {\n",
    "    \"name\":['alice','alice','bob','alice','jack','bob'],\n",
    "    \"age\":[26,25,30,25,35,30],\n",
    "    'city':['NY','NY','LA','NY','SF','LA']\n",
    "}\n",
    "df = pd.DataFrame(data)"
   ],
   "id": "f620beeaf06ba0ef",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "df.duplicated() #一整条记录都是一样的，标记为重复，返回True\n",
    "df.drop_duplicates(subset=['name']) #根据指定列去重\n",
    "df.drop_duplicates(subset=['name'],keep='last') #保留最后一次出现的行"
   ],
   "id": "7e05fb32c93228b0",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 数据类型的转换\n",
    "df = pd.read_csv('data/sleep.csv')\n",
    "df.dtypes"
   ],
   "id": "f4edf5c911ab7225",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "df['age'] = df['age'].astype('int16')",
   "id": "950c9b43e15b5e62",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "df['gender'] = df['gender'].astype('category')",
   "id": "e505d854fac5f3ac",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "df.gender",
   "id": "b2640cb7620574e4",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "df['is_male'] = df['gender'].map({'Female':True,'Male':False})",
   "id": "79c0c863d7848719",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "df.is_male",
   "id": "488493861a9ca586",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "#数据变形\n",
    "import pandas as pd\n",
    "data = {\n",
    "    'ID': [1, 2],\n",
    "    'name':['alice','bob'],\n",
    "    'Math': [90, 85],\n",
    "    'English': [88, 92],\n",
    "    'Science': [95, 89]\n",
    "}\n",
    "df = pd.DataFrame(data)\n",
    "print(df)\n",
    "df.T   #行列转置\n",
    "# 宽表转换成长表\n",
    "df2 = pd.melt(df,id_vars=['ID','name'],var_name='科目',value_name='分数')\n",
    "df2.sort_values('name')\n",
    "print(df2)\n",
    "# 长表转宽表\n",
    "pd.pivot(df2,index=['ID','name'],columns='科目',values='分数')"
   ],
   "id": "17063d5d4b1aae3a",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "data = {\n",
    "    'ID': [1, 2],\n",
    "    'name':['alice smith','bob smith'],\n",
    "    'Math': [90, 85],\n",
    "    'English': [88, 92],\n",
    "    'Science': [95, 89]\n",
    "}\n",
    "df = pd.DataFrame(data)\n",
    "# 分列\n",
    "df[['first','last']]  = df['name'].str.split(\" \",expand=True)\n",
    "df = pd.read_csv('data/sleep.csv')\n",
    "df = df[['person_id','blood_pressure']]\n",
    "df[['high','low']] = df['blood_pressure'].str.split('/',expand=True)\n",
    "df['high']=df['high'].astype('int64')\n",
    "df['low']=df['low'].astype('int64')\n",
    "df.info()\n",
    "df.high.mean()\n",
    "df.low.mean()"
   ],
   "id": "b763a3d784b45fbc",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 数据分箱 pd.cut(x,bins,labels)\n",
    "import pandas as pd\n",
    "df = pd.read_csv('data/employees.csv')\n",
    "df.head(10)"
   ],
   "id": "1a2ef4ba0754554",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "df1 = df.head(10)[['employee_id','salary']]\n",
    "df1"
   ],
   "id": "c7c26528fccff394",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "pd.cut(df1['salary'],bins=3) #bins=n，分成n段区间，起始值、结束值是所有数据的最小值、最大值\n",
    "#4180~14100~24000\n",
    "pd.cut(df1['salary'],bins=3).value_counts()\n",
    "pd.cut(df1['salary'],bins=[0,10000,20000,30000])#bins=list，分成n段区间\n",
    "pd.cut(df1['salary'],bins=[0,10000,20000,30000]).value_counts()\n",
    "df1['收入范围'] =pd.cut(df1['salary'],bins=[0,10000,20000,30000],labels=['低','中','高'])#bins=list，分成n段区间\n",
    "pd.qcut(df1['salary'],3).value_counts()"
   ],
   "id": "c2d51d480f6343b3",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 睡眠数据\n",
    "df = pd.read_csv('data/sleep.csv')\n",
    "df1 = df.head(10)[['person_id','sleep_quality']]\n",
    "df1\n",
    "df['睡眠质量'] = pd.cut(df['sleep_quality'],bins=3,labels=\n",
    "                         ['差','中','优'])\n",
    "df['睡眠质量'].value_counts()\n",
    "df.head(10)\n",
    "df['gender']=df['gender'].astype('category')\n",
    "df['gender'].value_counts()\n",
    "# 字符串-->类别-->统计\n",
    "# 数值-->分箱-->统计\n",
    "print(df['gender'].dtype)\n",
    "print(df['睡眠质量'].dtype)"
   ],
   "id": "9cebcb76aa2fff6d",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# df.rename()   df.set_index()  df.reset_index()\n",
    "df = pd.DataFrame({\n",
    "    'name':['jack','alice','tom','bob'],\n",
    "    'age':[20,30,40,50],\n",
    "    'gender':['female','male','female','male']\n",
    "})\n",
    "df.set_index(\"name\",inplace=True)\n",
    "df.reset_index(inplace=True)\n",
    "df.rename(columns={\"age\":\"年龄\"},index={0:4})"
   ],
   "id": "108d8a8bcf4d7e66",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "df.index=[1,2,3,4]\n",
    "df.columns=[\"姓名\",'年龄',\"性别\"]\n",
    "df"
   ],
   "id": "e30c5c7d2afdfef0"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 分组聚合\n",
    "# df.groupby('分组的字段')['聚合的字段'].聚合函数()\n",
    "import pandas as pd\n",
    "df = pd.read_csv('data/employees.csv')\n",
    "df = df.dropna(subset=['department_id'])\n",
    "df['department_id'] = df['department_id'].astype('int64')\n",
    "# 计算不同部门的平均薪资\n",
    "df.groupby('department_id').groups #查看分组\n",
    "df.groupby('department_id').get_group(20) #查看具体的某个分组数据\n",
    "df2 = df.groupby('department_id')[['salary']].mean()\n",
    "df2['salary'] = df2['salary'].round(2)\n",
    "df2=df2.reset_index()\n",
    "df2.sort_values('salary',ascending=False)"
   ],
   "id": "c2c5120328e9b71a",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 计算不同部门不同岗位的人的平均薪资\n",
    "df2=df.groupby(['department_id','job_id'])[['salary']].mean()\n",
    "df2=df2.reset_index()\n",
    "df2['salary'] = df2['salary'].round(1)\n",
    "df2.sort_values('salary',ascending=False)"
   ],
   "id": "5f30449d08d176f4",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 企鹅数据分析\n",
    "# 1. 导入必要的库\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "# 2. 导入数据 喙\n",
    "df = pd.read_csv('data/penguins.csv')\n",
    "df.head(5)\n",
    "df.info()\n"
   ],
   "id": "54d58bec294a4cf8",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 3. 数据清洗\n",
    "# 缺失值的检查\n",
    "print(df.isna().sum())\n",
    "df.dropna(inplace=True)"
   ],
   "id": "cb3b299ccc2f3e03",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 4. 数据特征的构造\n",
    "df['sex'] = df['sex'].astype('category')\n",
    "df['bill_ratio'] = df['bill_length_mm']/df['bill_depth_mm']\n",
    "df.head()"
   ],
   "id": "86492d5a6f55da98",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 5. 数据分析\n",
    "# 数据分箱-把体重分为三个等级\n",
    "labels = ['低','中','高']\n",
    "df['mass_level'] = pd.cut(df['body_mass_g'],bins=3,labels=labels)\n",
    "print(df['mass_level'].value_counts())\n",
    "# 按岛屿、性别分组分析\n",
    "df.groupby(['sex','island']).agg({\n",
    "    'body_mass_g':['mean','count'],\n",
    "})"
   ],
   "id": "f5f4f54a0f09a0b",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 睡眠质量分析\n",
    "# 1.导入库\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "# 2.导入数据\n",
    "df = pd.read_csv('data/sleep.csv')\n",
    "df.head()\n",
    "df.info()\n",
    "df.describe()"
   ],
   "id": "59667e648578215",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 3.数据清洗\n",
    "df.isna().sum()\n",
    "df.drop(columns='sleep_disorder',inplace=True)"
   ],
   "id": "b37136eeef585ca8",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 4. 数据特征的构造\n",
    "df['gender'] = df['gender'].astype('category')\n",
    "df['occupation'] = df['occupation'].astype('category')\n",
    "df['bmi_category'] = df['bmi_category'].astype('category')\n",
    "df[['high','low']]=df['blood_pressure'].str.split('/',expand=True)\n",
    "\n",
    "# 睡眠质量的分箱\n",
    "labels = ['差','中','优']\n",
    "df['quality_level'] = pd.cut(df['sleep_quality'],bins=3,labels=labels)\n",
    "age_labels=['青少年','中年','老年']\n",
    "df['age_level'] = pd.cut(df['age'],bins=3,labels=age_labels)\n",
    "df.head()"
   ],
   "id": "61c4e82c83d1b8fb",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 5.数据的统计、分析\n",
    "print(df['bmi_category'].value_counts())"
   ],
   "id": "861623889ca88459",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 根据不同的bmi分组，睡眠质量\n",
    "df.groupby(['age_level','bmi_category']).agg({\n",
    "    'sleep_duration':'mean',\n",
    "    'sleep_quality':'mean',\n",
    "    'stress_level':'mean'\n",
    "})"
   ],
   "id": "92285424ef2d790d",
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
